from gz_spider.items import GzSpiderItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class GuangzhouSpiderSpider(CrawlSpider):
    """Crawl city-design articles from the Guangzhou Planning and Natural
    Resources Bureau site (ghzyj.gz.gov.cn) into ``GzSpiderItem`` objects."""

    name = 'guangzhou_spider'
    allowed_domains = ['ghzyj.gz.gov.cn']
    start_urls = ['http://ghzyj.gz.gov.cn/ywpd/cxgh/cssj/index.html']

    rules = (
        # Pagination URLs: follow listing pages, no callback needed.
        Rule(LinkExtractor(allow=r'http://ghzyj.gz.gov.cn/ywpd/cxgh/cssj/index_\d+\.html'), follow=True),
        # Article detail URLs: parse each one with parse_detail.
        Rule(LinkExtractor(allow=r'http://ghzyj.gz.gov.cn/ywpd/cxgh/cssj/content/post_\d+\.html'),
             callback='parse_detail', follow=False)
    )

    def parse_detail(self, response):
        """Extract title, publication date and cleaned body text from one
        article page.

        :param response: scrapy Response for a post_*.html detail page
        :returns: a populated ``GzSpiderItem``; ``title``/``published_at``
                  may be None if the page deviates from the expected layout
        """
        title = response.xpath('//div[@class="content"]//h1[@id="content_title"]/text()').get()
        published_at = response.xpath('//span[@class="date"]/b/text()').get()
        contents = response.xpath('//div[@class="content_article"]//text()').getall()
        # BUG FIX: the original called .replace(r'\n', '') / .replace(r'\t', ''),
        # where the raw strings denote the two-character sequences backslash+n
        # and backslash+t — NOT real newline/tab characters. Since //text()
        # yields actual control characters, the cleanup silently did nothing.
        # Strip the real newline, tab and space characters instead.
        content = ''.join(contents).strip()
        for ws in ('\n', '\t', ' '):
            content = content.replace(ws, '')
        return GzSpiderItem(title=title, published_at=published_at, content=content)
